import time
import re
import os
import sys
import codecs
import shutil
import numpy as np
import matplotlib
import scipy
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import math
from sklearn.decomposition import PCA
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

def build_contingency_table(labels, tags, n_clusters=20, n_classes=20):
	"""Count how often each (cluster label, class tag) pair occurs.

	Args:
		labels: sequence of cluster indices, one per sample.
		tags: sequence of class indices, one per sample. Values may be
			floats (e.g. as returned by ``np.loadtxt``); they are truncated
			to integers before being used as indices.
		n_clusters: number of rows of the resulting table.
		n_classes: number of columns of the resulting table.

	Returns:
		A float array of shape ``(n_clusters, n_classes)`` where entry
		``[i, j]`` is the number of samples with label ``i`` and tag ``j``.

	Raises:
		ValueError: if ``labels`` and ``tags`` differ in length.
		IndexError: if a label or tag lies outside the table.
	"""
	# Float values cannot be used as array indices, so cast explicitly.
	# NOTE(review): assumes tags are integral values stored as floats -- confirm.
	labels = np.asarray(labels).ravel().astype(int)
	tags = np.asarray(tags).ravel().astype(int)
	if labels.shape != tags.shape:
		raise ValueError(
			"labels and tags must have the same length (got %d and %d)"
			% (labels.size, tags.size))
	# Kept as float so later divisions are true divisions on Python 2 as well.
	table = np.zeros((n_clusters, n_classes))
	# ufunc.at accumulates repeated (label, tag) pairs; plain fancy-index
	# assignment would count each distinct pair only once.
	np.add.at(table, (labels, tags), 1)
	return table


def mutual_information(table):
	"""Return the base-10 mutual information of a contingency table.

	Computes ``sum_ij p_ij * log10(p_ij / (p_i. * p_.j))`` over the non-zero
	cells, where ``p_ij = table[i, j] / n``, ``p_i.`` is the row marginal and
	``p_.j`` the column marginal.

	Args:
		table: 2-D array-like of non-negative counts.

	Returns:
		The mutual information as a float; ``0.0`` for an all-zero table.
	"""
	table = np.asarray(table, dtype=float)
	total = table.sum()
	if total == 0:
		return 0.0
	row_totals = table.sum(1)
	col_totals = table.sum(0)
	# Cell [i, j] must be normalised by row i and column j.
	expected = np.outer(row_totals, col_totals)
	nonzero = table > 0
	observed = table[nonzero]
	ratio = observed * total / expected[nonzero]
	return float(np.sum(observed / total * np.log10(ratio)))


if __name__ == "__main__":
	# Imported here because it is only needed when run as a script.
	from sklearn.cluster import KMeans

	n_groups = 20
	feature_path = "result/JXL_20newstopic_100_iter_100_lmd1_0_lmd2_0.1_sparse1_0_sparse2_0-U.txt"

	# Disabled alternative: build TF-IDF features from raw text instead of
	# loading a precomputed feature matrix.
	# corpus = [line.strip() for line in open(feature_path, 'r')]
	# vectorizer = CountVectorizer()
	# transformer = TfidfTransformer()
	# tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
	# word = vectorizer.get_feature_names()
	# weight = tfidf.toarray()
	weight = np.loadtxt(feature_path, delimiter=",")
	print("Features length: " + str(weight.shape))
	tag = np.loadtxt("result/20newsgroup.tag")

	# Disabled: dump the TF-IDF vocabulary and weights to a text file.
	# resName = "BHTfidf_Result.txt"
	# result = codecs.open(resName, 'w', 'utf-8')
	# for j in range(len(word)):
	# 	result.write(word[j] + " ")
	# result.write("\r\n\r\n")
	#
	# for i in range(len(weight)):
	# 	for j in range(len(word)):
	# 		result.write(str(weight[i][j]))
	# 	result.write("\r\n\r\n")
	# result.close()

	# k-means
	print("Start K-Means")
	clf = KMeans(n_clusters=n_groups)
	clf.fit(weight)
	# print(clf.cluster_centers_)
	# print(clf.inertia_)

	# Rows are cluster labels, columns are ground-truth tags.
	table = build_contingency_table(clf.labels_, tag, n_groups, n_groups)
	np.savetxt(feature_path + "map", table, delimiter=',', fmt='%d')

	# Disabled alternative metric: purity
	# score = 0
	# for row in table:
	# 	score = score + row.max() / row.sum()
	# print(score / 20)

	# Disabled alternative metric: entropy
	# score = 0
	# for row in table:
	# 	total = row.sum()
	# 	row_en = 0
	# 	for item in row:
	# 		if item > 0:
	# 			p = item / total
	# 			row_en = row_en - p * math.log(p)
	# 	score = score + total / table.sum() * row_en
	# print(score)

	# mutual information
	row_totals = table.sum(1)
	col_totals = table.sum(0)
	# Single-argument print keeps the output identical on Python 2 and 3.
	print("%s %s" % (row_totals, col_totals))
	print(mutual_information(table))

	# Disabled alternative metric: van Dongen criterion
	# n = table.sum()
	# score = 2 * n
	# for row in table:
	# 	score = score - row.max()
	# for col in table.T:
	# 	score = score - col.max()
	# score = score / 2 / n
	# print(score)